*This file converts raw claims data extracts into .dta files and does a preliminary cleaning of the key variables for the analysis.

********************************************************************************
*DEFINE DIRECTORIES
local home CHILD
local main CHILD/JPE
local logs CHILD/JPE/logs
local data CHILD/JPE/data
local results CHILD/JPE/results
local network NETWORK
local drugs DRUGS
local fda FDA
********************************************************************************

*bring in data***************************************************************
cd "`home'"
import delimited ccc_pull_claims.txt, varnames(1) clear
generate date = mofd(mdy(dm,1,dy))
format date %tm
drop dm dy
order patientid date, first
*hard code total_spend (only $0.25 off at max because of rounding)
replace total_spend = mh_spend+non_mh_spend
save ccc_pull_claims.dta, replace

*bring in data***************************************************************
cd "`home'"
import delimited ccc_pull_coverage.txt, varnames(1) clear
generate date = mofd(date(begin_dt,"YMD"))
format date %tm
drop begin_dt
generate first_mh = date(earliest,"YMD")
format first_mh %td
drop earliest
order patientid date months_after, first

*test whether gaps in coverage
duplicates drop patientid date mbrs_crnt_prim_zip_cd, force

*test for duplicates within member-month
by patientid date, s: generate duplicates = _N
by patientid: egen max = max(months_after_diagnosis)
replace max = max==months_after_diagnosis
replace mbrs_crnt="" if real(mbrs_crnt)==.
destring mbrs_crnt, replace
generate end_zip = mbrs_crnt*max
generate flag = duplicates>1 & max==1
by patientid: egen flag2 = sum(flag)
replace flag2=1 if flag2>1

replace mbrs_crnt=end_zip if duplicates>1 & flag2==0
drop flag flag2 max end_zip duplicates

by patientid date months_after_diagnosis mbrs_crnt, s: keep if _n==1
by patientid date: generate duplicates = _N
*now, there are 0.34% of duplicates
by patientid date: keep if _n==1

*now, check for gaps
tsset patientid date
generate run = .
by patientid,s: replace run = cond(L.run==., 1, L.run+1)
by patientid: egen maxrun=max(run)
generate first = run==1
by patientid: egen test =sum(first)
keep if test==1
drop test run maxrun duplicates first

*kids are left = 656,881
unique patientid
save ccc_pull_coverage.dta, replace

*bring in data***************************************************************
cd "`home'"
import delimited ccc_pull_hospital.txt, varnames(1) clear
tempfile hospital
save `hospital', replace

*bring in data
cd "`home'"
import delimited ccc_pull_er.txt, varnames(1) clear
tempfile er
save `er', replace

*bring in data
cd "`home'"
import delimited ccc_pull_evaluation.txt, varnames(1) clear
generate first_eval=1
tempfile eval
save `eval', replace

*bring in data***************************************************************
cd "`home'"
import delimited ccc_pull_demographics.txt, varnames(1) clear

merge m:1 patientid using `hospital'
drop _merge
merge m:1 patientid using `er'
drop _merge
merge m:1 patientid using `eval'
drop _merge

*hard code zeros
foreach var of varlist first_hospital first_er first_eval {
replace `var'=0 if `var'==.
}

*fix up date
generate bday = date(memb_brth_dt,"YMD")
format bday %td
generate female = memb_gender_cd=="F"
replace female=. if memb_gender_cd=="U"
drop memb_brth_dt memb_gender_cd

*look for people with contrasting records
*TO START: have 824,577 kids
duplicates drop patientid bday female, force
by patientid, s: generate duplicates = _N
*there are 4,527 with contrasting gender/birthdays
drop if duplicates>1
*we are left with 820,050 kids
drop patientid0 duplicates

*save combo file
save ccc_pull_demographics.dta, replace

*bring in data***************************************************************
cd "`home'"
import delimited ccc_pull_mhdrugs.txt, varnames(1) clear

*format date
generate date = date(serv_dt,"YMD")
format %td date
drop serv_dt

*fix up product code
tostring prod_serv_id_cd, gen(ndc11) format(%011.0f)
generate ndc9 = substr(ndc11,1,9)
drop ndc11 prod_serv_id_cd

*identify the drug
rename ndc9 ndc
preserve
cd "`drugs'"
import excel ndc_data_info.xlsx, firstrow clear
generate ndc = substr(NDC,1,9)
destring ndc, replace
by ndc, s: keep if _n==1
drop NDC
tempfile temp
save `temp', replace
restore

cd "`home'"
destring ndc, replace
merge m:1 ndc using `temp'
drop if _merge==2
order ndc DrugName-GPI_14_DESC, first
drop _merge

preserve
cd "`fda'"
import delimited product.txt, varnames(1) clear

generate flag = strpos(productndc,"-")
generate ndc9 = substr(productndc,1,flag-1)
generate length = length(ndc9)
replace ndc9 = "0"+ndc9 if length==4
replace length = length(substr(productndc,flag+1,.))
replace ndc9 = ndc9 + cond(length==4,substr(productndc,flag+1,.), "0" + substr(productndc,flag+1,.))
destring ndc9, replace
rename ndc9 ndc

by ndc, s: keep if _n==1

tempfile temp
save `temp', replace

restore

cd "`home'"
merge m:1 ndc using `temp'
drop if _merge==2
drop _merge

replace GPI_08=lower(GPI_08)
replace GPI_14=lower(GPI_14)
replace proprietaryname=lower(proprietaryname)
replace nonproprietaryname=lower(nonproprietaryname)
replace substancename=lower(substancename)

rename GPI_08 active1
rename GPI_14 active2
rename proprietaryname active3
rename nonproprietaryname active4
rename substancename active5

*now flag mental health drugs
local drugs aripiprazole asenapine bromazepam brexpiprazole cariprazine chlorpromazine chlorprothixene clozapine fluphenazine haloperidol iloperidone loxapine lurasidone mesoridazine methotrimeprazine molindone olanzapine paliperidone periciazine perphenazine pimavanserin pimozide promazine quetiapine risperidone sulpiride thioridazine thiothixene trifluoperazine ziprasidone zuclopenthixol agomelatine amitriptyline amoxapine brexanolone bupropion citalopram clomipramine desipramine desvenlafaxine doxepin duloxetine escitalopram esketamine fluoxetine fluvoxamine imipramine isocarboxazid levomilnacipran maprotiline mirtazapine nefazodone nortriptyline phenelzine proptriptyline reboxetine selegiline sertraline tranylcypromine trazodone trimipramine vilazodone vortioxetine alprazolam buspirone chlordiazepoxide clorazepate diazepam halazepam lorazepam midazolam oxazepam paroxetine prochlorperazine venlafaxine bromazepam eszopiclone hydroxyzine meprobamate metoprolol paroxetine pregabalin propranolol lithium divalproex lamotrigine carbamazepine prazepam quazepam temazepam triazolam clonazepam estazolam flurazepam halazepam nitrazepam

generate mh = 0
*recode mh variable
foreach drug of local drugs {
forvalues i = 1(1)5 {
replace mh = 1 if regexm(active`i',"`drug'")
}
}
*assert mh==1 
*yes, all are mental health drugs

*parse variables
keep ndc active1 active2 active3 active4 active5 patientid days_supply date mh
drop active2 active3 active5

*stretch prescription

*generate end of prescription date
generate end_date = date+days_supply
format end_date %td
replace end_date = mdy(12,31,2018) if end_date>mdy(12,31,2018)

*convert to monthly drug fill records
generate months = mofd(end_date)-mofd(date)+1
expand months
sort patientid date 

by patientid date, s: generate id = _n
generate month = mofd(date) + id - 1
format month %tm

*delete overlapping drug fill months
by patientid month, s: keep if _n==1
drop ndc days_supply_cnt date mh end_date months polypharmacy id
drop active1 active4

*fix variables
rename month date

*generate i_ad this month variable
generate i_drug=1
 
*label variable
label variable i_drug "DV: Active MH script"

*save file
save ccc_pull_mhdrugs.dta, replace

*bring in hospital data***************************************************************
cd "`home'"
import delimited ccc2_monthly.txt, varnames(1) clear

*keep only true hospital visits
keep if i_er + i_hospital>0
drop i_er i_hospital

*generate date
generate date = mofd(mdy(dm,1,dy))
format date %tm
drop dm dy

*parse variables
drop i_*

rename mh_spend hospital_mh
rename non_mh_spend hospital_nonmh
rename total_spend hospital

*save file
save ccc2_monthly.dta, replace

*********************************************************************length of time in facility

*bring in hospital length
cd "`home'"
import delimited ccc2_temp2_length.txt, varnames(1) clear

*fix dates
generate begin = date(first_serv_dt,"YMD")
generate end = date(last_serv_dt,"YMD")
format begin end %td
drop first last

generate time_in = end-begin

generate date = mofd(begin)
format date %tm

collapse (sum) time_in, by(patientid date)

save time_in_facility.dta, replace

*pull data together***************************************************************

*backbone
use ccc_pull_coverage.dta, clear

*keep the consistent IDs
merge m:1 patientid using ccc_pull_demographics.dta
keep if _merge==3
drop _merge
*N=654,186

*merge in claims
merge 1:1 patientid date using ccc_pull_claims.dta
drop if _merge==2
drop _merge

*merge in MH drug flag
merge 1:1 patientid date using ccc_pull_mhdrugs.dta
drop if _merge==2
drop _merge

*merge in hospital-only costs
merge 1:1 patientid date using ccc2_monthly.dta
drop if _merge==2
drop _merge

*hard code the zeros
foreach var of varlist mh_spend-hospital {
replace `var'=0 if `var'==.
}

*keep only those who are first diagnosed after age 10
generate age_at_diagnosis = mofd(first_mh)-mofd(bday)
keep if age_at_diagnosis>=120

*merge in time in facility
merge 1:1 patientid date using time_in_facility.dta
drop if _merge==2
drop _merge

*save the final file
save full_data.dta, replace
